import pandas as pd
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
%matplotlib inline
#--------Movie Lens Data directories-----------
MLens = "../Data/ml-latest-small/movies.csv"
MLensLinks = "../Data/ml-latest-small/links.csv"
MLensRatings = "../Data/ml-latest-small/ratings.csv"
MLensTags = "../Data/ml-latest-small/tags.csv"
#--------imdb Data directories-----------------
IMDB = "../Data/iMDB/IMDb movies.csv"
IMDBName = "../Data/iMDB/IMDb names.csv"
IMDBRatings = "../Data/iMDB/IMDb ratings.csv"
IMDBPrincipals = "../Data/iMDB/IMDb title_principals.csv"
movieLensdf = pd.read_csv(MLens)
linksdf = pd.read_csv(MLensLinks)
mLensRatingsdf = pd.read_csv(MLensRatings)
mLensTagsdf = pd.read_csv(MLensTags)
movieLensdf.head()
| movieId | title | genres | |
|---|---|---|---|
| 0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
| 1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
| 2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
| 3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
| 4 | 5 | Father of the Bride Part II (1995) | Comedy |
movieLensdf.dtypes
movieId int64 title object genres object dtype: object
linksdf.head()
| movieId | imdbId | tmdbId | |
|---|---|---|---|
| 0 | 1 | 114709 | 862.0 |
| 1 | 2 | 113497 | 8844.0 |
| 2 | 3 | 113228 | 15602.0 |
| 3 | 4 | 114885 | 31357.0 |
| 4 | 5 | 113041 | 11862.0 |
linksdf = linksdf.drop("tmdbId",axis=1)
linksdf.dtypes
movieId int64 imdbId int64 dtype: object
linksdf.head()
| movieId | imdbId | |
|---|---|---|
| 0 | 1 | 114709 |
| 1 | 2 | 113497 |
| 2 | 3 | 113228 |
| 3 | 4 | 114885 |
| 4 | 5 | 113041 |
mLensRatingsdf.head()
| userId | movieId | rating | timestamp | |
|---|---|---|---|---|
| 0 | 1 | 1 | 4.0 | 964982703 |
| 1 | 1 | 3 | 4.0 | 964981247 |
| 2 | 1 | 6 | 4.0 | 964982224 |
| 3 | 1 | 47 | 5.0 | 964983815 |
| 4 | 1 | 50 | 5.0 | 964982931 |
mLensRatingsdf.dtypes
userId int64 movieId int64 rating float64 timestamp int64 dtype: object
mLensTagsdf.head()
| userId | movieId | tag | timestamp | |
|---|---|---|---|---|
| 0 | 2 | 60756 | funny | 1445714994 |
| 1 | 2 | 60756 | Highly quotable | 1445714996 |
| 2 | 2 | 60756 | will ferrell | 1445714992 |
| 3 | 2 | 89774 | Boxing story | 1445715207 |
| 4 | 2 | 89774 | MMA | 1445715200 |
mLensTagsdf.dtypes
userId int64 movieId int64 tag object timestamp int64 dtype: object
imdbdf = pd.read_csv(IMDB,low_memory = False)
imdbNamesdf = pd.read_csv(IMDBName,low_memory = False)
imdbRatingsdf = pd.read_csv(IMDBRatings,low_memory = False)
imdbPrincipalsdf = pd.read_csv(IMDBPrincipals,low_memory = False)
imdbdf.dtypes
imdb_title_id object title object original_title object year object date_published object genre object duration int64 country object language object director object writer object production_company object actors object description object avg_vote float64 votes int64 budget object usa_gross_income object worlwide_gross_income object metascore float64 reviews_from_users float64 reviews_from_critics float64 dtype: object
imdbdf.head()
| imdb_title_id | title | original_title | year | date_published | genre | duration | country | language | director | ... | actors | description | avg_vote | votes | budget | usa_gross_income | worlwide_gross_income | metascore | reviews_from_users | reviews_from_critics | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | tt0000009 | Miss Jerry | Miss Jerry | 1894 | 1894-10-09 | Romance | 45 | USA | None | Alexander Black | ... | Blanche Bayliss, William Courtenay, Chauncey D... | The adventures of a female reporter in the 1890s. | 5.9 | 154 | NaN | NaN | NaN | NaN | 1.0 | 2.0 |
| 1 | tt0000574 | The Story of the Kelly Gang | The Story of the Kelly Gang | 1906 | 1906-12-26 | Biography, Crime, Drama | 70 | Australia | None | Charles Tait | ... | Elizabeth Tait, John Tait, Norman Campbell, Be... | True story of notorious Australian outlaw Ned ... | 6.1 | 589 | $ 2250 | NaN | NaN | NaN | 7.0 | 7.0 |
| 2 | tt0001892 | Den sorte drøm | Den sorte drøm | 1911 | 1911-08-19 | Drama | 53 | Germany, Denmark | NaN | Urban Gad | ... | Asta Nielsen, Valdemar Psilander, Gunnar Helse... | Two men of high rank are both wooing the beaut... | 5.8 | 188 | NaN | NaN | NaN | NaN | 5.0 | 2.0 |
| 3 | tt0002101 | Cleopatra | Cleopatra | 1912 | 1912-11-13 | Drama, History | 100 | USA | English | Charles L. Gaskill | ... | Helen Gardner, Pearl Sindelar, Miss Fielding, ... | The fabled queen of Egypt's affair with Roman ... | 5.2 | 446 | $ 45000 | NaN | NaN | NaN | 25.0 | 3.0 |
| 4 | tt0002130 | L'Inferno | L'Inferno | 1911 | 1911-03-06 | Adventure, Drama, Fantasy | 68 | Italy | Italian | Francesco Bertolini, Adolfo Padovan | ... | Salvatore Papa, Arturo Pirovano, Giuseppe de L... | Loosely adapted from Dante's Divine Comedy and... | 7.0 | 2237 | NaN | NaN | NaN | NaN | 31.0 | 14.0 |
5 rows × 22 columns
imdbdf.count()
imdb_title_id 85855 title 85855 original_title 85855 year 85855 date_published 85855 genre 85855 duration 85855 country 85791 language 85022 director 85768 writer 84283 production_company 81400 actors 85786 description 83740 avg_vote 85855 votes 85855 budget 23710 usa_gross_income 15326 worlwide_gross_income 31016 metascore 13305 reviews_from_users 78258 reviews_from_critics 74058 dtype: int64
Null values:
imdbdf.apply(lambda x: sum(x.isna()),axis=0)
imdb_title_id 0 title 0 original_title 0 year 0 date_published 0 genre 0 duration 0 country 64 language 833 director 87 writer 1572 production_company 4455 actors 69 description 2115 avg_vote 0 votes 0 budget 62145 usa_gross_income 70529 worlwide_gross_income 54839 metascore 72550 reviews_from_users 7597 reviews_from_critics 11797 dtype: int64
Depending on the visualization in question, we will need to decide what to do with the null values.
corr=imdbdf.corr()
fig, ax =plt.subplots(figsize=(12,8))
sns.heatmap(corr, cmap="Reds",
xticklabels=corr.columns,
yticklabels=corr.columns,
square=False )
plt.show()
imdbNamesdf.head()
| imdb_name_id | name | birth_name | height | bio | birth_details | date_of_birth | place_of_birth | death_details | date_of_death | place_of_death | reason_of_death | spouses_string | spouses | divorces | spouses_with_children | children | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | nm0000001 | Fred Astaire | Frederic Austerlitz Jr. | 177.0 | Fred Astaire was born in Omaha, Nebraska, to J... | May 10, 1899 in Omaha, Nebraska, USA | 1899-05-10 | Omaha, Nebraska, USA | June 22, 1987 in Los Angeles, California, USA ... | 1987-06-22 | Los Angeles, California, USA | pneumonia | Robyn Smith (27 June 1980 - 22 June 1987) (hi... | 2 | 0 | 1 | 2 |
| 1 | nm0000002 | Lauren Bacall | Betty Joan Perske | 174.0 | Lauren Bacall was born Betty Joan Perske on Se... | September 16, 1924 in The Bronx, New York City... | 1924-09-16 | The Bronx, New York City, New York, USA | August 12, 2014 in New York City, New York, US... | 2014-08-12 | New York City, New York, USA | stroke | Jason Robards (4 July 1961 - 10 September 196... | 2 | 1 | 2 | 3 |
| 2 | nm0000003 | Brigitte Bardot | Brigitte Bardot | 166.0 | Brigitte Bardot was born on September 28, 1934... | September 28, 1934 in Paris, France | 1934-09-28 | Paris, France | NaN | NaN | NaN | NaN | Bernard d'Ormale (16 August 1992 - present)\n... | 4 | 3 | 1 | 1 |
| 3 | nm0000004 | John Belushi | John Adam Belushi | 170.0 | John Belushi was born in Chicago, Illinois, US... | January 24, 1949 in Chicago, Illinois, USA | 1949-01-24 | Chicago, Illinois, USA | March 5, 1982 in Hollywood, Los Angeles, Calif... | 1982-03-05 | Hollywood, Los Angeles, California, USA | acute cocaine and heroin intoxication | Judith Belushi-Pisano (31 December 1976 - 5 M... | 1 | 0 | 0 | 0 |
| 4 | nm0000005 | Ingmar Bergman | Ernst Ingmar Bergman | 179.0 | Ernst Ingmar Bergman was born July 14, 1918, t... | July 14, 1918 in Uppsala, Uppsala län, Sweden | 1918-07-14 | Uppsala, Uppsala län, Sweden | July 30, 2007 in Fårö, Gotlands län, Sweden (... | 2007-07-30 | Fårö, Gotlands län, Sweden | natural causes | Ingrid Bergman (11 November 1971 - 20 May 199... | 5 | 4 | 5 | 8 |
imdbNamesdf.dtypes
imdb_name_id object name object birth_name object height float64 bio object birth_details object date_of_birth object place_of_birth object death_details object date_of_death object place_of_death object reason_of_death object spouses_string object spouses int64 divorces int64 spouses_with_children int64 children int64 dtype: object
Null values:
Null values:
imdbNamesdf.apply(lambda x: sum(x.isna()),axis=0)
imdb_name_id 0 name 0 birth_name 0 height 253024 bio 93007 birth_details 187093 date_of_birth 187093 place_of_birth 193713 death_details 257772 date_of_death 257772 place_of_death 260667 reason_of_death 275011 spouses_string 252353 spouses 0 divorces 0 spouses_with_children 0 children 0 dtype: int64
corr=imdbNamesdf.corr()
fig, ax =plt.subplots(figsize=(12,8))
sns.heatmap(corr, cmap="Reds",
xticklabels=corr.columns,
yticklabels=corr.columns,
square=False )
plt.savefig("../Figures/imdbNames.png")
plt.show()
imdbRatingsdf.head()
| imdb_title_id | weighted_average_vote | total_votes | mean_vote | median_vote | votes_10 | votes_9 | votes_8 | votes_7 | votes_6 | ... | females_30age_avg_vote | females_30age_votes | females_45age_avg_vote | females_45age_votes | top1000_voters_rating | top1000_voters_votes | us_voters_rating | us_voters_votes | non_us_voters_rating | non_us_voters_votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | tt0000009 | 5.9 | 154 | 5.9 | 6.0 | 12 | 4 | 10 | 43 | 28 | ... | 5.7 | 13.0 | 4.5 | 4.0 | 5.7 | 34.0 | 6.4 | 51.0 | 6.0 | 70.0 |
| 1 | tt0000574 | 6.1 | 589 | 6.3 | 6.0 | 57 | 18 | 58 | 137 | 139 | ... | 6.2 | 23.0 | 6.6 | 14.0 | 6.4 | 66.0 | 6.0 | 96.0 | 6.2 | 331.0 |
| 2 | tt0001892 | 5.8 | 188 | 6.0 | 6.0 | 6 | 6 | 17 | 44 | 52 | ... | 5.8 | 4.0 | 6.8 | 7.0 | 5.4 | 32.0 | 6.2 | 31.0 | 5.9 | 123.0 |
| 3 | tt0002101 | 5.2 | 446 | 5.3 | 5.0 | 15 | 8 | 16 | 62 | 98 | ... | 5.5 | 14.0 | 6.1 | 21.0 | 4.9 | 57.0 | 5.5 | 207.0 | 4.7 | 105.0 |
| 4 | tt0002130 | 7.0 | 2237 | 6.9 | 7.0 | 210 | 225 | 436 | 641 | 344 | ... | 7.3 | 82.0 | 7.4 | 77.0 | 6.9 | 139.0 | 7.0 | 488.0 | 7.0 | 1166.0 |
5 rows × 49 columns
imdbRatingsdf.dtypes
imdb_title_id object weighted_average_vote float64 total_votes int64 mean_vote float64 median_vote float64 votes_10 int64 votes_9 int64 votes_8 int64 votes_7 int64 votes_6 int64 votes_5 int64 votes_4 int64 votes_3 int64 votes_2 int64 votes_1 int64 allgenders_0age_avg_vote float64 allgenders_0age_votes float64 allgenders_18age_avg_vote float64 allgenders_18age_votes float64 allgenders_30age_avg_vote float64 allgenders_30age_votes float64 allgenders_45age_avg_vote float64 allgenders_45age_votes float64 males_allages_avg_vote float64 males_allages_votes float64 males_0age_avg_vote float64 males_0age_votes float64 males_18age_avg_vote float64 males_18age_votes float64 males_30age_avg_vote float64 males_30age_votes float64 males_45age_avg_vote float64 males_45age_votes float64 females_allages_avg_vote float64 females_allages_votes float64 females_0age_avg_vote float64 females_0age_votes float64 females_18age_avg_vote float64 females_18age_votes float64 females_30age_avg_vote float64 females_30age_votes float64 females_45age_avg_vote float64 females_45age_votes float64 top1000_voters_rating float64 top1000_voters_votes float64 us_voters_rating float64 us_voters_votes float64 non_us_voters_rating float64 non_us_voters_votes float64 dtype: object
Null values:
imdbRatingsdf.apply(lambda x: sum(x.isna()),axis=0)
imdb_title_id 0 weighted_average_vote 0 total_votes 0 mean_vote 0 median_vote 0 votes_10 0 votes_9 0 votes_8 0 votes_7 0 votes_6 0 votes_5 0 votes_4 0 votes_3 0 votes_2 0 votes_1 0 allgenders_0age_avg_vote 52496 allgenders_0age_votes 52496 allgenders_18age_avg_vote 706 allgenders_18age_votes 706 allgenders_30age_avg_vote 10 allgenders_30age_votes 10 allgenders_45age_avg_vote 80 allgenders_45age_votes 80 males_allages_avg_vote 1 males_allages_votes 1 males_0age_avg_vote 58444 males_0age_votes 58444 males_18age_avg_vote 1465 males_18age_votes 1465 males_30age_avg_vote 12 males_30age_votes 12 males_45age_avg_vote 101 males_45age_votes 101 females_allages_avg_vote 81 females_allages_votes 81 females_0age_avg_vote 63738 females_0age_votes 63738 females_18age_avg_vote 6521 females_18age_votes 6521 females_30age_avg_vote 944 females_30age_votes 944 females_45age_avg_vote 2798 females_45age_votes 2798 top1000_voters_rating 679 top1000_voters_votes 679 us_voters_rating 209 us_voters_votes 209 non_us_voters_rating 1 non_us_voters_votes 1 dtype: int64
votes_columns = np.array([ "votes_{}".format(i) for i in (range(1,11))])
imdbRatingsdf[votes_columns].head()
| votes_1 | votes_2 | votes_3 | votes_4 | votes_5 | votes_6 | votes_7 | votes_8 | votes_9 | votes_10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14 | 5 | 1 | 9 | 28 | 28 | 43 | 10 | 4 | 12 |
| 1 | 16 | 13 | 20 | 28 | 103 | 139 | 137 | 58 | 18 | 57 |
| 2 | 4 | 6 | 5 | 16 | 32 | 52 | 44 | 17 | 6 | 6 |
| 3 | 16 | 25 | 26 | 63 | 117 | 98 | 62 | 16 | 8 | 15 |
| 4 | 87 | 20 | 39 | 66 | 169 | 344 | 641 | 436 | 225 | 210 |
fig, ax = plt.subplots(figsize=(12,8))
imdbRatingsdf[votes_columns].sum().plot.bar(ax)
ax.set_ylabel("Voters count")
ax.set_xlabel("score given")
ax.set_title("Number of voters for each score")
plt.savefig("../Figures/scoresDistribution.png")
plt.show()
corr=imdbRatingsdf.corr()
fig, ax =plt.subplots(figsize=(24,16))
sns.heatmap(corr, cmap="Reds",
xticklabels=corr.columns,
yticklabels=corr.columns,
square=False )
plt.show()
imdbPrincipalsdf.head(10)
| imdb_title_id | ordering | imdb_name_id | category | job | characters | |
|---|---|---|---|---|---|---|
| 0 | tt0000009 | 1 | nm0063086 | actress | NaN | ["Miss Geraldine Holbrook (Miss Jerry)"] |
| 1 | tt0000009 | 2 | nm0183823 | actor | NaN | ["Mr. Hamilton"] |
| 2 | tt0000009 | 3 | nm1309758 | actor | NaN | ["Chauncey Depew - the Director of the New Yor... |
| 3 | tt0000009 | 4 | nm0085156 | director | NaN | NaN |
| 4 | tt0000574 | 1 | nm0846887 | actress | NaN | ["Kate Kelly"] |
| 5 | tt0000574 | 2 | nm0846894 | actor | NaN | ["School Master"] |
| 6 | tt0000574 | 3 | nm3002376 | actor | NaN | ["Steve Hart"] |
| 7 | tt0000574 | 4 | nm0170118 | actress | NaN | NaN |
| 8 | tt0000574 | 5 | nm0846879 | director | NaN | NaN |
| 9 | tt0000574 | 6 | nm0317210 | producer | producer | NaN |
imdbPrincipalsdf.dtypes
imdb_title_id object ordering int64 imdb_name_id object category object job object characters object dtype: object
Null values:
imdbPrincipalsdf.apply(lambda x: sum(x.isna()),axis=0)
imdb_title_id 0 ordering 0 imdb_name_id 0 category 0 job 622782 characters 494677 dtype: int64
mLensRatingsdf.head()
| userId | movieId | rating | timestamp | |
|---|---|---|---|---|
| 0 | 1 | 1 | 4.0 | 964982703 |
| 1 | 1 | 3 | 4.0 | 964981247 |
| 2 | 1 | 6 | 4.0 | 964982224 |
| 3 | 1 | 47 | 5.0 | 964983815 |
| 4 | 1 | 50 | 5.0 | 964982931 |
imdbRatingsdf.head()
| imdb_title_id | weighted_average_vote | total_votes | mean_vote | median_vote | votes_10 | votes_9 | votes_8 | votes_7 | votes_6 | ... | females_30age_avg_vote | females_30age_votes | females_45age_avg_vote | females_45age_votes | top1000_voters_rating | top1000_voters_votes | us_voters_rating | us_voters_votes | non_us_voters_rating | non_us_voters_votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | tt0000009 | 5.9 | 154 | 5.9 | 6.0 | 12 | 4 | 10 | 43 | 28 | ... | 5.7 | 13.0 | 4.5 | 4.0 | 5.7 | 34.0 | 6.4 | 51.0 | 6.0 | 70.0 |
| 1 | tt0000574 | 6.1 | 589 | 6.3 | 6.0 | 57 | 18 | 58 | 137 | 139 | ... | 6.2 | 23.0 | 6.6 | 14.0 | 6.4 | 66.0 | 6.0 | 96.0 | 6.2 | 331.0 |
| 2 | tt0001892 | 5.8 | 188 | 6.0 | 6.0 | 6 | 6 | 17 | 44 | 52 | ... | 5.8 | 4.0 | 6.8 | 7.0 | 5.4 | 32.0 | 6.2 | 31.0 | 5.9 | 123.0 |
| 3 | tt0002101 | 5.2 | 446 | 5.3 | 5.0 | 15 | 8 | 16 | 62 | 98 | ... | 5.5 | 14.0 | 6.1 | 21.0 | 4.9 | 57.0 | 5.5 | 207.0 | 4.7 | 105.0 |
| 4 | tt0002130 | 7.0 | 2237 | 6.9 | 7.0 | 210 | 225 | 436 | 641 | 344 | ... | 7.3 | 82.0 | 7.4 | 77.0 | 6.9 | 139.0 | 7.0 | 488.0 | 7.0 | 1166.0 |
5 rows × 49 columns
linksdf.head()
| movieId | imdbId | |
|---|---|---|
| 0 | 1 | 114709 |
| 1 | 2 | 113497 |
| 2 | 3 | 113228 |
| 3 | 4 | 114885 |
| 4 | 5 | 113041 |
linksdf['imdbId'] = linksdf["imdbId"].apply(lambda x: "tt{:07d}".format(x))
linksdf.head()
| movieId | imdbId | |
|---|---|---|
| 0 | 1 | tt0114709 |
| 1 | 2 | tt0113497 |
| 2 | 3 | tt0113228 |
| 3 | 4 | tt0114885 |
| 4 | 5 | tt0113041 |
mergedmovies = linksdf.merge(imdbRatingsdf, left_on="imdbId", right_on="imdb_title_id")
mergedmovies
| movieId | imdbId | imdb_title_id | weighted_average_vote | total_votes | mean_vote | median_vote | votes_10 | votes_9 | votes_8 | ... | females_30age_avg_vote | females_30age_votes | females_45age_avg_vote | females_45age_votes | top1000_voters_rating | top1000_voters_votes | us_voters_rating | us_voters_votes | non_us_voters_rating | non_us_voters_votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | 177024 | 214560 | 273073 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
| 1 | 2 | tt0113497 | tt0113497 | 7.0 | 299083 | 7.1 | 7.0 | 24313 | 22327 | 63008 | ... | 7.2 | 25960.0 | 7.2 | 4219.0 | 6.8 | 792.0 | 7.0 | 49740.0 | 6.9 | 134663.0 |
| 2 | 3 | tt0113228 | tt0113228 | 6.7 | 23742 | 6.8 | 7.0 | 2270 | 1389 | 3441 | ... | 6.9 | 1778.0 | 6.9 | 880.0 | 6.2 | 373.0 | 6.7 | 8332.0 | 6.6 | 8209.0 |
| 3 | 4 | tt0114885 | tt0114885 | 5.9 | 9353 | 6.2 | 6.0 | 1205 | 442 | 989 | ... | 6.5 | 1784.0 | 6.6 | 970.0 | 5.2 | 214.0 | 6.2 | 3824.0 | 5.3 | 2301.0 |
| 4 | 5 | tt0113041 | tt0113041 | 6.1 | 33587 | 6.2 | 6.0 | 2371 | 1370 | 3186 | ... | 6.4 | 5191.0 | 6.3 | 1188.0 | 5.5 | 382.0 | 6.1 | 9446.0 | 5.9 | 13165.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8557 | 193571 | tt3110014 | tt3110014 | 6.7 | 286 | 6.8 | 7.0 | 28 | 19 | 50 | ... | 6.8 | 33.0 | 8.3 | 3.0 | 8.0 | 6.0 | 7.4 | 29.0 | 6.6 | 154.0 |
| 8558 | 193573 | tt3837248 | tt3837248 | 7.2 | 455 | 7.7 | 8.0 | 172 | 31 | 72 | ... | 7.7 | 10.0 | 4.0 | 4.0 | 6.4 | 18.0 | 7.5 | 55.0 | 7.0 | 104.0 |
| 8559 | 193581 | tt5476944 | tt5476944 | 7.7 | 847 | 8.1 | 8.0 | 291 | 110 | 179 | ... | 8.0 | 50.0 | 8.1 | 13.0 | 6.2 | 16.0 | 7.8 | 99.0 | 7.4 | 259.0 |
| 8560 | 193583 | tt5914996 | tt5914996 | 7.5 | 2504 | 7.9 | 8.0 | 699 | 375 | 553 | ... | 7.1 | 33.0 | 5.8 | 13.0 | 5.5 | 28.0 | 7.1 | 232.0 | 7.4 | 757.0 |
| 8561 | 193587 | tt8391976 | tt8391976 | 7.2 | 521 | 7.5 | 8.0 | 109 | 45 | 113 | ... | 7.8 | 33.0 | 7.3 | 3.0 | 6.4 | 10.0 | 7.1 | 46.0 | 6.8 | 183.0 |
8562 rows × 51 columns
mergedDF = mLensRatingsdf.merge(mergedmovies, left_on="movieId", right_on="movieId")
mergedDF.head()
| userId | movieId | rating | timestamp | imdbId | imdb_title_id | weighted_average_vote | total_votes | mean_vote | median_vote | ... | females_30age_avg_vote | females_30age_votes | females_45age_avg_vote | females_45age_votes | top1000_voters_rating | top1000_voters_votes | us_voters_rating | us_voters_votes | non_us_voters_rating | non_us_voters_votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 4.0 | 964982703 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
| 1 | 5 | 1 | 4.0 | 847434962 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
| 2 | 7 | 1 | 4.5 | 1106635946 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
| 3 | 15 | 1 | 2.5 | 1510577970 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
| 4 | 17 | 1 | 4.5 | 1305696483 | tt0114709 | tt0114709 | 8.3 | 864461 | 8.3 | 8.0 | ... | 8.2 | 59102.0 | 8.3 | 12842.0 | 8.1 | 839.0 | 8.6 | 147914.0 | 8.2 | 338585.0 |
5 rows × 54 columns
mergedDF.columns
Index(['userId', 'movieId', 'rating', 'timestamp', 'imdbId', 'imdb_title_id',
'weighted_average_vote', 'total_votes', 'mean_vote', 'median_vote',
'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6', 'votes_5',
'votes_4', 'votes_3', 'votes_2', 'votes_1', 'allgenders_0age_avg_vote',
'allgenders_0age_votes', 'allgenders_18age_avg_vote',
'allgenders_18age_votes', 'allgenders_30age_avg_vote',
'allgenders_30age_votes', 'allgenders_45age_avg_vote',
'allgenders_45age_votes', 'males_allages_avg_vote',
'males_allages_votes', 'males_0age_avg_vote', 'males_0age_votes',
'males_18age_avg_vote', 'males_18age_votes', 'males_30age_avg_vote',
'males_30age_votes', 'males_45age_avg_vote', 'males_45age_votes',
'females_allages_avg_vote', 'females_allages_votes',
'females_0age_avg_vote', 'females_0age_votes', 'females_18age_avg_vote',
'females_18age_votes', 'females_30age_avg_vote', 'females_30age_votes',
'females_45age_avg_vote', 'females_45age_votes',
'top1000_voters_rating', 'top1000_voters_votes', 'us_voters_rating',
'us_voters_votes', 'non_us_voters_rating', 'non_us_voters_votes'],
dtype='object')
imdbdf_clean = imdbdf[['title','year', 'genre', 'duration', 'country', 'director', 'writer', 'production_company', 'actors', 'avg_vote']] #keep the wanted column
imdbdf_clean
| title | year | genre | duration | country | director | writer | production_company | actors | avg_vote | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Miss Jerry | 1894 | Romance | 45 | USA | Alexander Black | Alexander Black | Alexander Black Photoplays | Blanche Bayliss, William Courtenay, Chauncey D... | 5.9 |
| 1 | The Story of the Kelly Gang | 1906 | Biography, Crime, Drama | 70 | Australia | Charles Tait | Charles Tait | J. and N. Tait | Elizabeth Tait, John Tait, Norman Campbell, Be... | 6.1 |
| 2 | Den sorte drøm | 1911 | Drama | 53 | Germany, Denmark | Urban Gad | Urban Gad, Gebhard Schätzler-Perasini | Fotorama | Asta Nielsen, Valdemar Psilander, Gunnar Helse... | 5.8 |
| 3 | Cleopatra | 1912 | Drama, History | 100 | USA | Charles L. Gaskill | Victorien Sardou | Helen Gardner Picture Players | Helen Gardner, Pearl Sindelar, Miss Fielding, ... | 5.2 |
| 4 | L'Inferno | 1911 | Adventure, Drama, Fantasy | 68 | Italy | Francesco Bertolini, Adolfo Padovan | Dante Alighieri | Milano Film | Salvatore Papa, Arturo Pirovano, Giuseppe de L... | 7.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 85850 | Le lion | 2020 | Comedy | 95 | France, Belgium | Ludovic Colbeau-Justin | Alexandre Coquelle, Matthieu Le Naour | Monkey Pack Films | Dany Boon, Philippe Katerine, Anne Serra, Samu... | 5.3 |
| 85851 | De Beentjes van Sint-Hildegard | 2020 | Comedy, Drama | 103 | Netherlands | Johan Nijenhuis | Radek Bajgar, Herman Finkers | Johan Nijenhuis & Co | Herman Finkers, Johanna ter Steege, Leonie ter... | 7.7 |
| 85852 | Padmavyuhathile Abhimanyu | 2019 | Drama | 130 | India | Vineesh Aaradya | Vineesh Aaradya, Vineesh Aaradya | RMCC Productions | Anoop Chandran, Indrans, Sona Nair, Simon Brit... | 7.9 |
| 85853 | Sokagin Çocuklari | 2019 | Drama, Family | 98 | Turkey | Ahmet Faik Akinci | Ahmet Faik Akinci, Kasim Uçkan | Gizem Ajans | Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,... | 6.4 |
| 85854 | La vida sense la Sara Amat | 2019 | Drama | 74 | Spain | Laura Jou | Coral Cruz, Pep Puig | La Xarxa de Comunicació Local | Maria Morera Colomer, Biel Rossell Pelfort, Is... | 6.7 |
85855 rows × 10 columns
Remove the row where the value for column 'year' is 'TV Movie 2019'
#imdbdf_clean[imdbdf_clean["year"] == 'TV Movie 2019']["year"].apply(lambda x: x[-4:])
imdbdf_clean = imdbdf_clean[imdbdf_clean.year != 'TV Movie 2019']
imdbdf_clean['year'] = imdbdf_clean['year'].astype('int')
imdbdf_clean.isna().sum() #find all the NaN values
title 0 year 0 genre 0 duration 0 country 64 director 87 writer 1572 production_company 4455 actors 69 avg_vote 0 dtype: int64
imdbdf_clean = imdbdf_clean.dropna() #drop all the rows with NaN values
imdbdf_clean.isna().sum()
title 0 year 0 genre 0 duration 0 country 0 director 0 writer 0 production_company 0 actors 0 avg_vote 0 dtype: int64
Separate the countries so we can analyze them separately. This piece of code will be used later on (in the section "top countries creating movies").
countries = {} #create empty dictionary
list_countries = list(imdbdf_clean['country']) #set up a list of countries
for i in list_countries:
i = list(i.split(',')) #split countries separated by commas
if len(i) == 1: #if 1 country in countries.keys()
if i in list(countries.keys()): #countries.keys() is the country name, countries.values() is the count of country name
countries[i] +=1 #count
else:
countries[i[0]] = 0
else:
for j in i: #does the same but for len(i) != 1
if j in list(countries.keys()):
countries[j] += 1 #count
else:
countries[j] =1
Separate the genre so we can analyze them separately This piece of code will be used later on.
genre = list(imdbdf_clean['genre'])
genre_list = [] #create an empty list
for i in genre:
i = list(i.split(',')) #split words when comma
for j in i:
genre_list.append(j.replace(' ', '')) #replace extra space
genre_counter = Counter(genre_list) #We'll need it after
imdbdf_clean.describe()
| year | duration | avg_vote | |
|---|---|---|---|
| count | 80161.000000 | 80161.000000 | 80161.000000 |
| mean | 1992.965644 | 100.016342 | 5.913369 |
| std | 24.551436 | 22.023683 | 1.216666 |
| min | 1894.000000 | 41.000000 | 1.000000 |
| 25% | 1977.000000 | 88.000000 | 5.300000 |
| 50% | 2002.000000 | 96.000000 | 6.100000 |
| 75% | 2013.000000 | 108.000000 | 6.800000 |
| max | 2020.000000 | 808.000000 | 9.800000 |
Distribution per year:
#groupby year and count how many title we have each year
title = imdbdf_clean.groupby('year').agg({'title': ['count']})
title.columns = ['Title Count']
title = title.sort_values('Title Count', ascending = False)
title.head(5)
| Title Count | |
|---|---|
| year | |
| 2017 | 3010 |
| 2018 | 2957 |
| 2016 | 2895 |
| 2015 | 2742 |
| 2014 | 2675 |
#visualize with a bar graph in descending title count order
fig_dim = (25,12)
fig, ax = plt.subplots(figsize = fig_dim)
sns.countplot(x = 'year', data = imdbdf_clean, order = imdbdf_clean['year'].value_counts().index)
plt.title('Title count per year, descending sorting per number of title released')
plt.xticks(rotation=90)
plt.show()
#visualize with a plot graph in ascending year order
fig_dim = (25,12)
fig, ax = plt.subplots(figsize = fig_dim)
ax = sns.countplot(x = 'year', data = imdbdf_clean)
plt.title('Title count per year')
#display the count on the bar graph
for p in ax.patches:
ax.annotate('{:.0f}'.format(p.get_height()), (p.get_x(), p.get_height() + 20), rotation=90)
plt.xticks(rotation=90)
plt.show()
Get the growth year to year (in %):
title
| Title Count | |
|---|---|
| year | |
| 2017 | 3010 |
| 2018 | 2957 |
| 2016 | 2895 |
| 2015 | 2742 |
| 2014 | 2675 |
| ... | ... |
| 1913 | 11 |
| 1912 | 5 |
| 1911 | 4 |
| 1906 | 1 |
| 1894 | 1 |
112 rows × 1 columns
title = title.sort_index() #sort by year. The year is the index
growth = title.pct_change() #calculate the %growth year by year
growth.columns = ['% growth']
plt.figure(figsize = (15, 8))
x = growth['% growth']
y = growth.index
plt.bar(y, x)
plt.plot(y, x, color = 'red', linewidth = 2.0)
plt.title('Movie growth per year')
plt.xlabel('Year')
plt.ylabel('% growth')
plt.show()
Get the top rated movies:
Top 10 rated movie with the year and the country
All time best movies:
imdbdf_clean = imdbdf_clean.sort_values(by = 'avg_vote', ascending = False)
rating = imdbdf_clean[0:10] #get the 1st 10 values
fig = px.sunburst(rating, path = ['year', 'country', 'title'], values = 'avg_vote', color = 'avg_vote')
fig.show()
Let's consider the best movies of this century:
rating = imdbdf_clean[imdbdf_clean.year > 1999][0:10] #get the 1st 10 values
fig = px.sunburst(rating, path = ['year', 'country', 'title'], values = 'avg_vote', color = 'avg_vote')
fig.show()
Let's see which year was the best year for movies production:
best = imdbdf_clean.groupby('year').agg({'avg_vote': ['mean']})
best.columns = ['Average rating']
best = best.sort_values('Average rating', ascending = False)
best.head()
| Average rating | |
|---|---|
| year | |
| 1923 | 6.812121 |
| 1924 | 6.790196 |
| 1928 | 6.778313 |
| 1926 | 6.698592 |
| 1921 | 6.697872 |
plt.figure(figsize = (15, 8))
x = best['Average rating'].round(decimals = 3)
y = best.index
plt.bar(y, x)
plt.title('Average rating per year')
plt.xlabel('Year')
plt.ylabel('Average rating (on a scale of 10)')
plt.show()
best = best[best.index >=2000]
plt.figure(figsize = (15, 8))
x = best['Average rating'].round(decimals = 3)
y = best.index
plt.bar(y, x)
plt.title('Average rating per year')
plt.xlabel('Year')
plt.ylabel('Average rating (on a scale of 10)')
plt.show()
Top countries creating movies:
countries_fin = {} #create an empty dictionary
for country, no in countries.items():
country = country.replace(' ', '') #remove the extra space create by the split(',')
if country in list(countries_fin.keys()):
countries_fin[country] += no
else:
countries_fin[country] = no
#sort the country according to their count
countries_fin = {k: v for k, v in sorted(countries_fin.items(), key = lambda item: item[1], reverse= True)}
plt.figure(figsize = (8,8))
ax = sns.barplot(x = list(countries_fin.keys())[0:10], y = list(countries_fin.values())[0:10])
plt.title('Top 10 countries creating movies')
plt.xticks(rotation=45)
#display the count on the bar graph
for p in ax.patches:
ax.annotate('{:.0f}'.format(p.get_height()), (p.get_x() + 0.1, p.get_height() + 20))
plt.show()
Let's see how each country conributes in the volume of all movies produced:
#create the dataframe
df_countries = pd.DataFrame(list(countries_fin.items()), columns = ['Country', 'Country count'])
#plot a pie chart of the distribution of each country per region
fig,ax = plt.subplots()
x = df_countries['Country count'][:10] #get the top 10 countries
labels = df_countries['Country'][:10] #get the top 10 countries
ax.pie(x, labels = labels, radius = 2)
#create a white circle at the center of the pie to create a donut chart
my_circle = plt.Circle( (0,0), 0.3, color = 'white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
Genre:
genres = {k: v for k, v in sorted(genre_counter.items(), key=lambda item: item[1], reverse= True)}
genres
{'Drama': 44166,
'Comedy': 27475,
'Romance': 13233,
'Action': 12084,
'Thriller': 10706,
'Crime': 10541,
'Horror': 8896,
'Adventure': 7302,
'Mystery': 4933,
'Family': 3723,
'Fantasy': 3585,
'Sci-Fi': 3411,
'Biography': 2314,
'History': 2190,
'War': 2132,
'Animation': 1994,
'Musical': 1907,
'Music': 1620,
'Western': 1556,
'Sport': 1025,
'Film-Noir': 661,
'Reality-TV': 2,
'Documentary': 1,
'News': 1}
fig_dim = (15,10)
fig, ax = plt.subplots(figsize = fig_dim)
x = list(genres.keys())
y = list(genres.values())
ax.vlines(x, ymin = 0, ymax = y, linewidth = 8)
plt.xticks(rotation = 90)
plt.show()
#create list of genre (unique genre)
text = list(genres.keys())
#create the wordcloud object
wordcloud = WordCloud(width = 500, height = 500, max_words = 100000, background_color = 'white').generate(str(text))
#Display the generated image
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.margins(x = 0, y = 0)
plt.show()
Exploring Plotly:
config = {'responsive': False}
genres_df = pd.DataFrame({"Genre":genres.keys(),"number_of_movies":genres.values()})
genres_bar = px.bar(genres_df[:10],
x = 'Genre',
y = 'number_of_movies',
title = 'Top 10 Genres in terms of number of movies',
text = 'number_of_movies',
labels = dict(genre = 'Genre', number_of_movies = 'Number of movies'),
color = 'Genre')
genres_bar.show()
years_df = imdbdf_clean[['year','title', 'genre']].groupby(['year','genre']).count().reset_index().rename(columns={'title':'number_of_movies'})
years_df = years_df.sort_values(by=['number_of_movies'], ascending=False)
years_df = years_df[years_df['year'] >= 2000]#Movies of this century
years_df = years_df[years_df['number_of_movies'] >= 100]#years where the number of movies is over 100
detailed_genres_bar = px.bar(years_df,
x ='year',
y = 'number_of_movies',
color = 'genre',
title='Movies produced in the 21st century classified by genres',
text = 'number_of_movies',
labels = dict(year = 'Year', number_of_movies = 'Number of movies')
)
detailed_genres_bar.show()
Producction Companies:
production_companies_df = imdbdf_clean[['production_company','title']].groupby(['production_company']).count().reset_index().rename(columns={'title':'number_of_movies'})
production_companies_df = production_companies_df.sort_values(by='number_of_movies', ascending=False)
production_companies_df = production_companies_df.iloc[:10]
production_companies_df
| production_company | number_of_movies | |
|---|---|---|
| 20624 | Metro-Goldwyn-Mayer (MGM) | 1283 |
| 30637 | Warner Bros. | 1152 |
| 8179 | Columbia Pictures | 913 |
| 23166 | Paramount Pictures | 903 |
| 29674 | Twentieth Century Fox | 864 |
| 29942 | Universal Pictures | 732 |
| 24630 | RKO Radio Pictures | 535 |
| 21216 | Mosfilm | 278 |
| 29938 | Universal International Pictures (UI) | 272 |
| 6402 | Canal+ | 231 |
detailed_genres_bar = px.bar(production_companies_df,
x ='production_company',
y = 'number_of_movies',
color = 'production_company',
title='Top 10 production companies',
text = 'number_of_movies',
labels = dict(production_company = 'Production Company', number_of_movies = 'Number of movies')
)
detailed_genres_bar.show()
If we choose only this century:
production_companies_df = imdbdf_clean[imdbdf_clean.year > 1999][['production_company','title']].groupby(['production_company']).count().reset_index().rename(columns={'title':'number_of_movies'})
production_companies_df = production_companies_df.sort_values(by='number_of_movies', ascending=False)
production_companies_df = production_companies_df.iloc[:10]
detailed_genres_bar = px.bar(production_companies_df,
x ='production_company',
y = 'number_of_movies',
color = 'production_company',
title='Top 10 production companies',
text = 'number_of_movies',
labels = dict(production_company = 'Production Company', number_of_movies = 'Number of movies')
)
detailed_genres_bar.show()